"""
Executor bindings
"""
from __future__ import annotations
import datetime
import os
import torch
import typing
__all__ = ['BatchingType', 'CommunicationMode', 'CommunicationType', 'Executor', 'ExecutorConfig', 'InflightBatchingStats', 'IterationStats', 'KvCacheConfig', 'KvCacheStats', 'LoraConfig', 'ModelType', 'OutputConfig', 'ParallelConfig', 'PeftCacheConfig', 'PromptTuningConfig', 'Request', 'RequestStage', 'RequestStats', 'RequestStatsPerIteration', 'Response', 'Result', 'SamplingConfig', 'SchedulerConfig', 'SchedulerPolicy', 'SpeculativeDecodingConfig', 'StaticBatchingStats']
class BatchingType:
    """
    Enum selecting the batching scheme used by the executor
    (see ExecutorConfig.batching_type).

    Members:

      STATIC (0)

      INFLIGHT (1)
    """
    INFLIGHT: typing.ClassVar[BatchingType]  # value = <BatchingType.INFLIGHT: 1>
    STATIC: typing.ClassVar[BatchingType]  # value = <BatchingType.STATIC: 0>
    __members__: typing.ClassVar[dict[str, BatchingType]]  # value = {'STATIC': <BatchingType.STATIC: 0>, 'INFLIGHT': <BatchingType.INFLIGHT: 1>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'STATIC'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class CommunicationMode:
    """
    Enum selecting how ranks communicate (see ParallelConfig).
    Only one mode is currently exposed.

    Members:

      LEADER (0)
    """
    LEADER: typing.ClassVar[CommunicationMode]  # value = <CommunicationMode.LEADER: 0>
    __members__: typing.ClassVar[dict[str, CommunicationMode]]  # value = {'LEADER': <CommunicationMode.LEADER: 0>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'LEADER'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class CommunicationType:
    """
    Enum selecting the transport used between ranks (see ParallelConfig).
    Only MPI is currently exposed.

    Members:

      MPI (0)
    """
    MPI: typing.ClassVar[CommunicationType]  # value = <CommunicationType.MPI: 0>
    __members__: typing.ClassVar[dict[str, CommunicationType]]  # value = {'MPI': <CommunicationType.MPI: 0>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'MPI'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class Executor:
    """
    Runs inference on a compiled engine: enqueue Requests, await Responses,
    fetch per-iteration statistics, and shut down.

    Usable as a context manager; __exit__ presumably calls shutdown() --
    NOTE(review): confirm against the compiled binding.
    """
    def __enter__(self) -> typing.Any:
        ...
    def __exit__(self, arg0: typing.Any, arg1: typing.Any, arg2: typing.Any) -> None:
        ...
    @typing.overload
    def __init__(self, model_path: os.PathLike, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Construct from an engine located at *model_path* on disk."""
        ...
    @typing.overload
    def __init__(self, engine_buffer: str, json_config_str: str, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Construct from an in-memory engine buffer plus its JSON config string."""
        ...
    @typing.overload
    def await_responses(self, timeout: datetime.timedelta | None = None) -> list[Response]:
        """Wait (up to *timeout*, or indefinitely if None) for responses to any request."""
        ...
    @typing.overload
    def await_responses(self, id: int, timeout: datetime.timedelta | None = None) -> list[Response]:
        """Wait for responses belonging to the single request *id*."""
        ...
    @typing.overload
    def await_responses(self, ids: list[int], timeout: datetime.timedelta | None = None) -> list[list[Response]]:
        """Wait for responses for each request in *ids*; one response list per id."""
        ...
    def can_enqueue_requests(self) -> bool:
        """Return True if this instance may enqueue requests (e.g. it is the leader rank -- TODO confirm)."""
        ...
    # Annotation fixed from `id: int = None`: the default is None, so the
    # parameter is optional, matching get_num_responses_ready below.
    def cancel_request(self, id: int | None = None) -> None:
        """Cancel the request with the given id."""
        ...
    def enqueue_request(self, request: Request) -> int:
        """Submit a single request; returns the id used to poll/await its responses."""
        ...
    def enqueue_requests(self, requests: list[Request]) -> list[int]:
        """Submit several requests; returns their ids in order."""
        ...
    def get_latest_iteration_stats(self) -> list[IterationStats]:
        """Return recent per-iteration statistics (window set by ExecutorConfig.iter_stats_max_iterations)."""
        ...
    def get_latest_request_stats(self) -> list[RequestStatsPerIteration]:
        """Return recent per-request statistics (window set by ExecutorConfig.request_stats_max_iterations)."""
        ...
    def get_num_responses_ready(self, id: int | None = None) -> int:
        """Number of responses ready for request *id*, or across all requests when id is None."""
        ...
    def shutdown(self) -> None:
        """Stop the executor and release its resources."""
        ...
class ExecutorConfig:
    """
    Configuration bundle passed to Executor's constructor.

    Plain attributes below are read/write; the remaining options are
    exposed through properties further down.
    """
    batching_type: BatchingType            # STATIC or INFLIGHT batching
    enable_chunked_context: bool           # process context phase in chunks when True
    iter_stats_max_iterations: int         # iterations retained for get_latest_iteration_stats
    kv_cache_config: KvCacheConfig         # KV-cache sizing/behavior options
    max_beam_width: int                    # maximum beam width any request may use
    normalize_log_probs: bool              # normalize returned log probs when True
    request_stats_max_iterations: int      # iterations retained for get_latest_request_stats (0 = disabled, presumably)
    scheduler_config: SchedulerConfig      # request scheduling policy
    def __init__(self, max_beam_width: int = 1, scheduler_config: SchedulerConfig = ..., kv_cache_config: KvCacheConfig = ..., enable_chunked_context: bool = False, normalize_log_probs: bool = True, iter_stats_max_iterations: int = 1000, request_stats_max_iterations: int = 0, batching_type: BatchingType = ..., parallel_config: ParallelConfig | None = None, peft_cache_config: PeftCacheConfig = ..., logits_post_processor_map: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int], None]] | None = None, medusa_choices: list[list[int]] | None = None) -> None:
        """All options are keyword-assignable; `...` defaults are constructed by the binding."""
        ...
    @property
    def logits_post_processor_map(self) -> dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int], None]] | None:
        """Named callbacks that mutate logits in place; selected per request via Request.logits_post_processor_name."""
        ...
    @logits_post_processor_map.setter
    def logits_post_processor_map(self, arg1: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int], None]]) -> None:
        ...
    @property
    def medusa_choices(self) -> list[list[int]] | None:
        """Medusa speculative-decoding choice tree, when configured."""
        ...
    @medusa_choices.setter
    def medusa_choices(self, arg1: list[list[int]]) -> None:
        ...
    @property
    def parallel_config(self) -> ParallelConfig | None:
        """Multi-rank communication settings; None for single-process use."""
        ...
    @parallel_config.setter
    def parallel_config(self, arg1: ParallelConfig) -> None:
        ...
    @property
    def peft_cache_config(self) -> PeftCacheConfig | None:
        """PEFT (e.g. LoRA) adapter cache sizing options."""
        ...
    @peft_cache_config.setter
    def peft_cache_config(self, arg1: PeftCacheConfig) -> None:
        ...
class InflightBatchingStats:
    """Per-iteration statistics specific to in-flight batching (see IterationStats)."""
    micro_batch_id: int            # id of the micro-batch these stats describe
    num_context_requests: int      # requests in the context (prefill) phase
    num_ctx_tokens: int            # total context tokens processed this iteration
    num_gen_requests: int          # requests in the generation phase
    num_paused_requests: int       # requests currently paused
    num_scheduled_requests: int    # requests scheduled this iteration
    def __init__(self) -> None:
        ...
class IterationStats:
    """Statistics snapshot for a single executor iteration."""
    cpu_mem_usage: int                                       # host memory usage (presumably bytes -- confirm)
    gpu_mem_usage: int                                       # device memory usage (presumably bytes -- confirm)
    inflight_batching_stats: InflightBatchingStats | None    # set when in-flight batching is active
    iter: int                                                # iteration index
    kv_cache_stats: KvCacheStats | None                      # KV-cache block usage, when available
    max_num_active_requests: int                             # capacity of active requests
    num_active_requests: int                                 # requests currently active
    pinned_mem_usage: int                                    # pinned-host memory usage (presumably bytes -- confirm)
    static_batching_stats: StaticBatchingStats | None        # set when static batching is active
    timestamp: str                                           # time the snapshot was taken (string-formatted)
    def __init__(self) -> None:
        ...
class KvCacheConfig:
    """
    KV-cache sizing and behavior options (read-only after construction;
    all fields are exposed as properties only).
    """
    def __init__(self, enable_block_reuse: bool = False, max_tokens: int | None = None, max_attention_window: int | None = None, sink_token_length: int | None = None, free_gpu_memory_fraction: float | None = None, host_cache_size: int | None = None, onboard_blocks: bool = True) -> None:
        """None values leave the corresponding limit to the binding's default."""
        ...
    @property
    def enable_block_reuse(self) -> bool:
        """Whether KV-cache blocks may be reused across requests."""
        ...
    @property
    def free_gpu_memory_fraction(self) -> float | None:
        """Fraction of free GPU memory the cache may claim, if set."""
        ...
    @property
    def host_cache_size(self) -> int | None:
        """Host-side cache size (presumably bytes -- confirm), if set."""
        ...
    @property
    def max_attention_window(self) -> int | None:
        """Maximum attention window in tokens, if set."""
        ...
    @property
    def max_tokens(self) -> int | None:
        """Maximum number of tokens the cache may hold, if set."""
        ...
    @property
    def onboard_blocks(self) -> bool:
        """Whether offloaded blocks are brought back on-device when needed (per name -- confirm)."""
        ...
    @property
    def sink_token_length(self) -> int | None:
        """Number of sink tokens, if set."""
        ...
class KvCacheStats:
    """KV-cache block usage for one iteration (see IterationStats.kv_cache_stats)."""
    free_num_blocks: int    # blocks currently free
    max_num_blocks: int     # total block capacity
    tokens_per_block: int   # tokens stored per block
    used_num_blocks: int    # blocks currently in use
    def __init__(self) -> None:
        ...
class LoraConfig:
    """
    Per-request LoRA adapter: a task id plus optional weight/config tensors.
    Read-only after construction.
    """
    def __init__(self, task_id: int, weights: torch.Tensor | None = None, config: torch.Tensor | None = None) -> None:
        """Weights/config may be omitted when the adapter identified by *task_id* is already cached (presumably -- confirm)."""
        ...
    @property
    def config(self) -> torch.Tensor | None:
        """Adapter configuration tensor, if provided."""
        ...
    @property
    def task_id(self) -> int:
        """Identifier of the LoRA task/adapter."""
        ...
    @property
    def weights(self) -> torch.Tensor | None:
        """Adapter weight tensor, if provided."""
        ...
class ModelType:
    """
    Enum describing the model architecture passed to Executor's constructor.
    Only decoder-only models are currently exposed.

    Members:

      DECODER_ONLY (0)
    """
    DECODER_ONLY: typing.ClassVar[ModelType]  # value = <ModelType.DECODER_ONLY: 0>
    __members__: typing.ClassVar[dict[str, ModelType]]  # value = {'DECODER_ONLY': <ModelType.DECODER_ONLY: 0>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'DECODER_ONLY'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class OutputConfig:
    """Selects which optional outputs a Request's Result should include."""
    exclude_input_from_output: bool    # drop the prompt tokens from output_token_ids when True
    return_context_logits: bool        # populate Result.context_logits when True
    return_generation_logits: bool     # populate Result.generation_logits when True
    return_log_probs: bool             # populate Result.log_probs/cum_log_probs when True
    def __init__(self, return_log_probs: bool = False, return_context_logits: bool = False, return_generation_logits: bool = False, exclude_input_from_output: bool = False) -> None:
        """All flags default to False (minimal output)."""
        ...
class ParallelConfig:
    """Multi-rank execution settings: transport, mode, and participating devices/ranks."""
    communication_type: CommunicationType    # transport between ranks (MPI)
    communication_mode: CommunicationMode    # orchestration mode (LEADER)
    def __init__(self, communication_type: CommunicationType = ..., communication_mode: CommunicationMode = ..., device_ids: list[int] | None = None, participant_ids: list[int] | None = None) -> None:
        """`...` defaults are constructed by the binding; None lists leave selection to the binding."""
        ...
    @property
    def device_ids(self) -> list[int] | None:
        """GPU device ids to run on, if restricted."""
        ...
    @device_ids.setter
    def device_ids(self, arg1: list[int]) -> None:
        ...
    @property
    def participant_ids(self) -> list[int] | None:
        """Rank/participant ids taking part in execution, if restricted."""
        ...
    @participant_ids.setter
    def participant_ids(self, arg1: list[int]) -> None:
        ...
class PeftCacheConfig:
    """
    Sizing options for the PEFT (e.g. LoRA) adapter cache.
    Read-only after construction; all fields are exposed as properties.
    """
    def __init__(self, num_host_module_layer: int = 0, num_device_module_layer: int = 0, optimal_adapter_size: int = 8, max_adapter_size: int = 64, num_put_workers: int = 1, num_ensure_workers: int = 1, num_copy_streams: int = 1, max_pages_per_block_host: int = 24, max_pages_per_block_device: int = 8, device_cache_percent: float | None = None, host_cache_size: int | None = None) -> None:
        """None values leave the corresponding cache size to the binding's default."""
        ...
    @property
    def device_cache_percent(self) -> float | None:
        """Fraction of device memory for the cache, if set."""
        ...
    @property
    def host_cache_size(self) -> int | None:
        """Host cache size (presumably bytes -- confirm), if set."""
        ...
    @property
    def max_adapter_size(self) -> int:
        """Largest supported adapter size."""
        ...
    @property
    def max_pages_per_block_device(self) -> int:
        """Cache pages per block on device."""
        ...
    @property
    def max_pages_per_block_host(self) -> int:
        """Cache pages per block on host."""
        ...
    @property
    def num_copy_streams(self) -> int:
        """Number of copy streams used for cache transfers."""
        ...
    @property
    def num_device_module_layer(self) -> int:
        """Module-layer capacity of the device cache."""
        ...
    @property
    def num_ensure_workers(self) -> int:
        """Worker threads for 'ensure' cache operations."""
        ...
    @property
    def num_host_module_layer(self) -> int:
        """Module-layer capacity of the host cache."""
        ...
    @property
    def num_put_workers(self) -> int:
        """Worker threads for 'put' cache operations."""
        ...
    @property
    def optimal_adapter_size(self) -> int:
        """Adapter size the cache is tuned for."""
        ...
class PromptTuningConfig:
    """Per-request prompt-tuning (p-tuning) input: a virtual-token embedding table."""
    def __init__(self, embedding_table: torch.Tensor) -> None:
        # Expected tensor shape is not visible here -- presumably
        # (num_virtual_tokens, hidden_dim); confirm against the binding.
        ...
    @property
    def embedding_table(self) -> torch.Tensor:
        """The prompt-tuning embedding table supplied at construction."""
        ...
class Request:
    """
    A single generation request: prompt tokens plus sampling/output options
    and optional per-request features (words lists, embedding bias,
    speculative decoding, prompt tuning, LoRA).
    """
    output_config: OutputConfig        # which optional outputs to return
    sampling_config: SamplingConfig    # decoding/sampling parameters
    streaming: bool                    # emit partial responses as tokens are produced when True
    def __init__(self, input_token_ids: list[int], max_new_tokens: int, streaming: bool = False, sampling_config: SamplingConfig = ..., output_config: OutputConfig = ..., end_id: int | None = None, pad_id: int | None = None, bad_words: list[list[int]] | None = None, stop_words: list[list[int]] | None = None, embedding_bias: torch.Tensor | None = None, speculative_decoding_config: SpeculativeDecodingConfig | None = None, prompt_tuning_config: PromptTuningConfig | None = None, lora_config: LoraConfig | None = None) -> None:
        """Only the prompt (*input_token_ids*) and *max_new_tokens* are required; `...` defaults are binding-constructed."""
        ...
    @property
    def bad_words(self) -> list[list[int]] | None:
        """Token-id sequences that must not be generated, if set."""
        ...
    @bad_words.setter
    def bad_words(self, arg1: list[list[int]]) -> None:
        ...
    @property
    def embedding_bias(self) -> torch.Tensor | None:
        """Bias tensor applied to logits (presumably vocab-sized -- confirm), if set."""
        ...
    @embedding_bias.setter
    def embedding_bias(self, arg1: torch.Tensor) -> None:
        ...
    @property
    def end_id(self) -> int | None:
        """End-of-sequence token id, if set."""
        ...
    @end_id.setter
    def end_id(self, arg1: int) -> None:
        ...
    @property
    def input_token_ids(self) -> list[int]:
        """The prompt token ids (read-only)."""
        ...
    @property
    def logits_post_processor_name(self) -> str | None:
        """Key into ExecutorConfig.logits_post_processor_map selecting a callback, if set."""
        ...
    @logits_post_processor_name.setter
    def logits_post_processor_name(self, arg1: str) -> None:
        ...
    @property
    def lora_config(self) -> LoraConfig | None:
        """LoRA adapter to apply to this request, if set."""
        ...
    @lora_config.setter
    def lora_config(self, arg1: LoraConfig) -> None:
        ...
    @property
    def max_new_tokens(self) -> int:
        """Maximum number of tokens to generate (read-only)."""
        ...
    @property
    def pad_id(self) -> int | None:
        """Padding token id, if set."""
        ...
    @pad_id.setter
    def pad_id(self, arg1: int) -> None:
        ...
    @property
    def prompt_tuning_config(self) -> PromptTuningConfig | None:
        """Prompt-tuning embeddings for this request, if set."""
        ...
    @prompt_tuning_config.setter
    def prompt_tuning_config(self, arg1: PromptTuningConfig) -> None:
        ...
    @property
    def speculative_decoding_config(self) -> SpeculativeDecodingConfig | None:
        """Draft tokens/logits for speculative decoding, if set."""
        ...
    @speculative_decoding_config.setter
    def speculative_decoding_config(self, arg1: SpeculativeDecodingConfig) -> None:
        ...
    @property
    def stop_words(self) -> list[list[int]] | None:
        """Token-id sequences that terminate generation, if set."""
        ...
    @stop_words.setter
    def stop_words(self, arg1: list[list[int]]) -> None:
        ...
class RequestStage:
    """
    Enum for a request's lifecycle stage (see RequestStats.stage).

    Members:

      QUEUED (0)

      CONTEXT_IN_PROGRESS (1)

      GENERATION_IN_PROGRESS (2)

      GENERATION_COMPLETE (3)
    """
    CONTEXT_IN_PROGRESS: typing.ClassVar[RequestStage]  # value = <RequestStage.CONTEXT_IN_PROGRESS: 1>
    GENERATION_COMPLETE: typing.ClassVar[RequestStage]  # value = <RequestStage.GENERATION_COMPLETE: 3>
    GENERATION_IN_PROGRESS: typing.ClassVar[RequestStage]  # value = <RequestStage.GENERATION_IN_PROGRESS: 2>
    QUEUED: typing.ClassVar[RequestStage]  # value = <RequestStage.QUEUED: 0>
    __members__: typing.ClassVar[dict[str, RequestStage]]  # value = {'QUEUED': <RequestStage.QUEUED: 0>, 'CONTEXT_IN_PROGRESS': <RequestStage.CONTEXT_IN_PROGRESS: 1>, 'GENERATION_IN_PROGRESS': <RequestStage.GENERATION_IN_PROGRESS: 2>, 'GENERATION_COMPLETE': <RequestStage.GENERATION_COMPLETE: 3>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'QUEUED'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class RequestStats:
    """Per-request statistics for one iteration (see RequestStatsPerIteration)."""
    context_prefill_position: int    # progress through the context/prefill phase
    id: int                          # the request's id (as returned by enqueue_request)
    num_generated_tokens: int        # tokens generated so far
    paused: bool                     # whether the request is currently paused
    scheduled: bool                  # whether the request was scheduled this iteration
    stage: RequestStage              # current lifecycle stage
    def __init__(self) -> None:
        ...
class RequestStatsPerIteration:
    """Statistics for all requests observed in one iteration."""
    iter: int                          # iteration index
    request_stats: list[RequestStats]  # one entry per tracked request
    def __init__(self) -> None:
        ...
class Response:
    """
    Outcome of a request: either a Result or an error message,
    tagged with the originating request id.
    """
    @typing.overload
    def __init__(self, request_id: int, error_msg: str) -> None:
        """Construct an error response."""
        ...
    @typing.overload
    def __init__(self, request_id: int, result: Result) -> None:
        """Construct a successful response carrying a Result."""
        ...
    def has_error(self) -> bool:
        """True when this response carries an error rather than a result."""
        ...
    @property
    def error_msg(self) -> str:
        """The error message; meaningful only when has_error() is True (presumably raises otherwise -- confirm)."""
        ...
    @property
    def request_id(self) -> int:
        """Id of the request this response belongs to."""
        ...
    @property
    def result(self) -> Result:
        """The generation result; meaningful only when has_error() is False."""
        ...
class Result:
    """Generation output carried by a successful Response."""
    context_logits: torch.Tensor | None       # set only when OutputConfig.return_context_logits
    cum_log_probs: list[float] | None         # per-beam cumulative log probs; needs OutputConfig.return_log_probs
    generation_logits: torch.Tensor | None    # set only when OutputConfig.return_generation_logits
    is_final: bool                            # True on the last response for a (streaming) request
    log_probs: list[list[float]] | None       # per-beam, per-token log probs; needs OutputConfig.return_log_probs
    output_token_ids: list[list[int]]         # generated token ids, one list per beam
    def __init__(self) -> None:
        ...
class SamplingConfig:
    """
    Decoding/sampling parameters for a Request. None means "use the
    binding's default" for that option. Read-only after construction;
    every option is exposed as a property below.
    """
    def __init__(self, beam_width: int = 1, top_k: int | None = None, top_p: float | None = None, top_p_min: float | None = None, top_p_reset_ids: int | None = None, top_p_decay: float | None = None, random_seed: int | None = None, temperature: float | None = None, min_length: int | None = None, beam_search_diversity_rate: float | None = None, repetition_penalty: float | None = None, presence_penalty: float | None = None, frequency_penalty: float | None = None, length_penalty: float | None = None, early_stopping: int | None = None) -> None:
        """beam_width defaults to 1 (greedy/sampled single beam)."""
        ...
    @property
    def beam_search_diversity_rate(self) -> float | None:
        """Diversity rate for beam search, if set."""
        ...
    @property
    def beam_width(self) -> int:
        """Number of beams (1 = no beam search)."""
        ...
    @property
    def early_stopping(self) -> int | None:
        """Beam-search early-stopping setting, if set."""
        ...
    @property
    def frequency_penalty(self) -> float | None:
        """Penalty proportional to token frequency, if set."""
        ...
    @property
    def length_penalty(self) -> float | None:
        """Beam-length penalty, if set."""
        ...
    @property
    def min_length(self) -> int | None:
        """Minimum number of generated tokens, if set."""
        ...
    @property
    def presence_penalty(self) -> float | None:
        """Penalty for tokens already present, if set."""
        ...
    @property
    def random_seed(self) -> int | None:
        """Seed for the sampling RNG, if set."""
        ...
    @property
    def repetition_penalty(self) -> float | None:
        """Multiplicative repetition penalty, if set."""
        ...
    @property
    def temperature(self) -> float | None:
        """Softmax temperature, if set."""
        ...
    @property
    def top_k(self) -> int | None:
        """Top-k sampling cutoff, if set."""
        ...
    @property
    def top_p(self) -> float | None:
        """Nucleus (top-p) sampling cutoff, if set."""
        ...
    @property
    def top_p_decay(self) -> float | None:
        """Decay applied to top_p over steps, if set."""
        ...
    @property
    def top_p_min(self) -> float | None:
        """Lower bound top_p may decay to, if set."""
        ...
    @property
    def top_p_reset_ids(self) -> int | None:
        """Token id that resets the top_p decay, if set (per name -- confirm)."""
        ...
class SchedulerConfig:
    """Wraps the request-scheduling policy used by the executor."""
    def __init__(self, policy: SchedulerPolicy = ...) -> None:
        """The `...` default policy is chosen by the binding."""
        ...
    @property
    def policy(self) -> SchedulerPolicy:
        """The configured scheduling policy (read-only)."""
        ...
class SchedulerPolicy:
    """
    Enum for the request-scheduling policy (see SchedulerConfig).

    Members:

      MAX_UTILIZATION (0)

      GUARANTEED_NO_EVICT (1)
    """
    GUARANTEED_NO_EVICT: typing.ClassVar[SchedulerPolicy]  # value = <SchedulerPolicy.GUARANTEED_NO_EVICT: 1>
    MAX_UTILIZATION: typing.ClassVar[SchedulerPolicy]  # value = <SchedulerPolicy.MAX_UTILIZATION: 0>
    __members__: typing.ClassVar[dict[str, SchedulerPolicy]]  # value = {'MAX_UTILIZATION': <SchedulerPolicy.MAX_UTILIZATION: 0>, 'GUARANTEED_NO_EVICT': <SchedulerPolicy.GUARANTEED_NO_EVICT: 1>}
    # Standard pybind11 enum plumbing: comparison, hashing, pickling and
    # integer conversion are implemented by the compiled extension.
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        """Member name, e.g. 'MAX_UTILIZATION'."""
        ...
    @property
    def value(self) -> int:
        """Underlying integer value of the member."""
        ...
class SpeculativeDecodingConfig:
    """
    Per-request speculative-decoding input: draft tokens plus optional
    draft logits and an acceptance threshold. Read-only after construction.
    """
    def __init__(self, tokens: list[int], logits: torch.Tensor | None = None, acceptance_threshold: float | None = None) -> None:
        """Only the draft *tokens* are required."""
        ...
    @property
    def acceptance_threshold(self) -> float | None:
        """Threshold for accepting draft tokens, if set."""
        ...
    @property
    def logits(self) -> torch.Tensor | None:
        """Draft logits accompanying the tokens, if provided."""
        ...
    @property
    def tokens(self) -> list[int]:
        """The draft token ids."""
        ...
class StaticBatchingStats:
    """Per-iteration statistics specific to static batching (see IterationStats)."""
    empty_gen_slots: int           # unused generation slots in the batch
    num_context_requests: int      # requests in the context (prefill) phase
    num_ctx_tokens: int            # total context tokens processed this iteration
    num_gen_tokens: int            # total generated tokens this iteration
    num_scheduled_requests: int    # requests scheduled this iteration
    def __init__(self) -> None:
        ...
